%---------------------------------------------------------------------------%
%-                                                                         -%
%-                             Bibliography                                -%
%-                                                                         -%
%---------------------------------------------------------------------------%
% NOTE(review): IEEE JSSC vol. 52, no. 1 is the January 2017 issue; confirm
% whether year should be 2017 (online-first appeared late 2016).
@article{chen2016eyeriss,
  title={{Eyeriss}: An energy-efficient reconfigurable accelerator for deep convolutional neural networks},
  author={Chen, Yu-Hsin and Krishna, Tushar and Emer, Joel S. and Sze, Vivienne},
  journal={IEEE Journal of Solid-State Circuits},
  volume={52},
  number={1},
  pages={127--138},
  year={2016},
  publisher={IEEE}
}

@inproceedings{farabet2009cnp,
  title={{CNP}: An {FPGA}-based processor for convolutional networks},
  author={Farabet, Cl{\'e}ment and Poulet, Cyril and Han, Jefferson Y. and LeCun, Yann},
  booktitle={2009 International Conference on Field Programmable Logic and Applications},
  pages={32--37},
  year={2009},
  organization={IEEE}
}

@inproceedings{peemen2013memory,
  title={Memory-centric accelerator design for convolutional neural networks},
  author={Peemen, Maurice and Setio, Arnaud A. A. and Mesman, Bart and Corporaal, Henk},
  booktitle={2013 IEEE 31st International Conference on Computer Design (ICCD)},
  pages={13--19},
  year={2013},
  organization={IEEE}
}

@article{chen2014diannao,
  title={{DianNao}: A small-footprint high-throughput accelerator for ubiquitous machine-learning},
  author={Chen, Tianshi and Du, Zidong and Sun, Ninghui and Wang, Jia and Wu, Chengyong and Chen, Yunji and Temam, Olivier},
  journal={ACM SIGARCH Computer Architecture News},
  volume={42},
  number={1},
  pages={269--284},
  year={2014},
  publisher={ACM}
}

@techreport{waterman2011risc,
  title={The {RISC-V} Instruction Set Manual, Volume {I}: Base User-Level {ISA}},
  author={Waterman, Andrew and Lee, Yunsup and Patterson, David A. and Asanovi{\'c}, Krste},
  institution={EECS Department, University of California, Berkeley},
  number={UCB/EECS-2011-62},
  year={2011}
}

@techreport{asanovic2014instruction,
  title={Instruction sets should be free: The case for {RISC-V}},
  author={Asanovi{\'c}, Krste and Patterson, David A.},
  institution={EECS Department, University of California, Berkeley},
  number={UCB/EECS-2014-146},
  year={2014}
}

@article{lecun1998gradient,
  title={Gradient-based learning applied to document recognition},
  author={LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
  journal={Proceedings of the IEEE},
  volume={86},
  number={11},
  pages={2278--2324},
  year={1998},
  publisher={IEEE}
}

@inproceedings{redmon2016you,
  author    = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali},
  title     = {You only look once: Unified, real-time object detection},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {779--788}
}

@inproceedings{redmon2017yolo9000,
  title={{YOLO9000}: better, faster, stronger},
  author={Redmon, Joseph and Farhadi, Ali},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={7263--7271},
  year={2017}
}

@article{redmon2018yolov3,
  title={{YOLOv3}: An incremental improvement},
  author={Redmon, Joseph and Farhadi, Ali},
  journal={arXiv preprint arXiv:1804.02767},
  year={2018},
  eprint={1804.02767},
  archiveprefix={arXiv},
  primaryclass={cs.CV}
}

@article{garofaloPULPNNAcceleratingQuantized2020,
  title={{{PULP}}-{{NN}}: Accelerating Quantized Neural Networks on Parallel Ultra-Low-Power {{RISC}}-{{V}} Processors},
  shorttitle={{{PULP}}-{{NN}}},
  author={Garofalo, Angelo and Rusci, Manuele and Conti, Francesco and Rossi, Davide and Benini, Luca},
  journal={Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences},
  year={2020},
  month=feb,
  volume={378},
  number={2164},
  pages={20190155},
  issn={1364-503X, 1471-2962},
  doi={10.1098/rsta.2019.0155},
  language={en},
  abstract={We present PULP-NN, an optimized computing library for a parallel ultra-low-power tightly coupled cluster of RISC-V processors. The key innovation in PULP-NN is a set of kernels for quantized neural network inference, targeting byte and sub-byte data types, down to INT-1, tuned for the recent trend toward aggressive quantization in deep neural network inference. The proposed library exploits both the digital signal processing extensions available in the PULP RISC-V processors and the cluster's parallelism, achieving up to 15.5\,MACs/cycle on INT-8 and improving performance by up to 63\,\texttimes{} with respect to a sequential implementation on a single RISC-V core implementing the baseline RV32IMC ISA. Using PULP-NN, a CIFAR-10 network on an octa-core cluster runs in 30\,\texttimes{} and 19.6\,\texttimes{} less clock cycles than the current state-of-the-art ARM CMSIS-NN library, running on STM32L4 and STM32H7 MCUs, respectively. The proposed library, when running on a GAP-8 processor, outperforms by 36.8\,\texttimes{} and by 7.45\,\texttimes{} the execution on energy efficient MCUs such as STM32L4 and high-end MCUs such as STM32H7 respectively, when operating at the maximum frequency. The energy efficiency on GAP-8 is 14.1\,\texttimes{} higher than STM32L4 and 39.5\,\texttimes{} higher than STM32H7, at the maximum efficiency operating point.
            This article is part of the theme issue `Harmonizing energy-autonomous computing and intelligence'.}
}

@inproceedings{lin2014microsoft,
  title={{Microsoft COCO}: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

@misc{everingham2007pascal,
  title={The {PASCAL} Visual Object Classes Challenge 2007 ({VOC2007}) Results},
  author={Everingham, Mark and Van Gool, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew},
  year={2007},
  howpublished={\url{http://www.pascal-network.org/challenges/VOC/voc2007/workshop/index.html}}
}

@inproceedings{nakahara2018lightweight,
  title={A lightweight {YOLOv2}: A binarized {CNN} with a parallel support vector regression for an {FPGA}},
  author={Nakahara, Hiroki and Yonekawa, Haruyoshi and Fujii, Tomoya and Sato, Shimpei},
  booktitle={Proceedings of the 2018 ACM/SIGDA International Symposium on field-programmable gate arrays},
  pages={31--40},
  year={2018}
}

@misc{xilinx2015zynq,
  title={{Zynq-7000 All Programmable SoC}: Technical Reference Manual},
  author={{Xilinx}},
  year={2015},
  month=feb,
  note={UG585}
}

@misc{PYNQ_MB,
  title        = {PYNQ MicroBlaze Subsystem},
  year         = {2018},
  howpublished = {\url{https://pynq.readthedocs.io/en/latest/overlay_design_methodology/pynq_microblaze_subsystem.html}}
}

@book{胡振波2018手把手教你设计,
  title={手把手教你设计 {CPU}: {RISC-V} 处理器篇},
  author={胡振波},
  year={2018},
  publisher={人民邮电出版社}
}

@phdthesis{陈辰2019基于,
  title={基于{FPGA}的神经网络加速器的规模可伸缩性研究},
  author={陈辰},
  school={江南大学},
  year={2019}
}
@phdthesis{杨维科2018基于,
  title={基于{RISC-V}开源处理器的卷积神经网络加速器设计方法研究},
  author={杨维科},
  school={上海交通大学},
  year={2018},
  abstract={近年来,随着深度学习的发展,卷积神经网络(Convolutional Neural Network,CNN)的硬件加速逐渐成为了研究的热门问题.可重构的加速器与通用CPU相结合的模式,既有通用性,又有针对具体问题情景的优化,成为一种高效地解决卷积神经网络加速问题的方案.但是,一方面商用CPU的专利授权费日益高昂,另一方面商用CPU与加速器各自的开发流程不兼容,有着开发流程复杂等问题,因此以RISC-V为代表的开源处理器+加速器的模式并结合全自动化设计方法,有助于更高效地进行CNN加速平台设计.为此,本文提出了一种基于RISC-V开源处理器的卷积神经网络加速结构及其硬件设计.本文首先通过对现有加速器结构,如加法树结构,脉动阵列结构,Eyeriss结构等进行分析,选取了综合性能较好的Eyeriss结构作为基础,之后在单个处理单元(Process Element,PE)内部,PE阵列的结构,PE阵列之间的并行,系统软硬件划分等四个层面进行研究.在单个PE层面,为了减少PE单元内部数据的移动,本文采用了维护循环数组指针的方式,提高了PE单元内部的运算效率.在PE阵列的层面,本文针对Eyeriss结构在卷积神经网络运行后期容易造成PE资源浪费的问题,提出了一种尺寸自适应的加速结构,有效提高了PE资源的利用率,进而对卷积神经网络起到了加速作用,此外也针对不同网络以及卷积层可能存在的Stride不同的情况进行了优化.在PE阵列之间并行的层面,本文利用输入输出混合并行的思想,分析了基于输入特征图和输出特征图的并行结构对带宽,缓存等的要求,最终在网络结构不改变的情况下设计了2×1×2的并行加速结构,在控制访存带宽的同时提高了加速效果.在系统软硬件划分方面,为了提高本文设计的灵活性,对系统的控制逻辑等进行了适当的划分,提高了系统对不同网络结构的适应性.本文在Rocket-Chip Emulator中对设计进行仿真,在Vivado软件中进行仿真和综合,测试结果表明:在使用本文结构的情况下前向流程的周期数缩减为串行的19.46\%.相比普通Eyeriss结构,本文效果提高了22.3\%.引入了输入输出特征图的并行结构后,一张图完成前向流程的周期数缩减为串行结构的11.6\%,相比普通Eyeriss结构,本文效果提高了13.01\%.实验结果验证了该结构在加速卷积神经网络方面的有效性,同时综合结果也说明硬件资源的消耗在可以接受的范围内.}
}

@article{贠晨阳基于,
  title={基于{PicoRV32}开源处理器的{SOC}平台搭建},
  author={贠晨阳 and 苗瑞霞},
  journal={现代电子技术},
  volume={42},
  number={21},
  pages={90--93},
  year={2019},
  abstract={由于现有的处理器架构及IP核存在授权费用高、兼容性差等问题,近两年出现的新型RISC-V架构有着开源、免费等优势,文中基于RISC-V指令集的PicoRV32开源处理器,搭建一个精简SOC硬件平台。通过运行呼吸灯测试程序,验证了该平台的正确性。在Xilinx XCVU440的FPGA开发板下资源显示最高频率为381.2 MHz,LUTs为1 137。可见PicoRV32开源微处理器具有逻辑门数少、跑频高的优点,可被用作FPGA设计和ASIC设计的辅助处理器,具有较高的研究价值和应用前景,并且所设计的平台可适用于其他处理器的SOC搭建及FPGA综合验证。}
}

@article{雷思磊2017RISC,
  title={{RISC-V}架构的开源处理器及{SoC}研究综述},
  author={雷思磊},
  journal={单片机与嵌入式系统应用},
  number={2},
  pages={56--60, 76},
  year={2017},
  keywords={RISC-V; Rocket; BOOM; SoC},
  abstract={RISC-V是一种新的指令集架构,发布以来得到了大量关注,在描述了RISC-V的产生背景,基本设计的基础上,简单比较了其与现有的开源指令集架构,商业指令集架构的优劣,然后详细介绍了现有的采用RISC-V架构的开源处理器,开源SoC,并展望了RISC-V的未来发展.}
}

% TODO(review): required field `year` is missing for this @article (计算机工程 /
% Computer Engineering); confirm the publication year, volume and issue.
@article{zhangJiYuRISCVChuLiQiDeJuanJiJiaSuSoC,
  title={基于{RISC-V}处理器的卷积加速{SoC}},
  author={张, 坤宁 and 赵, 烁 and 何, 虎 and 邓, 宁 and 杨, 旭},
  journal={计算机工程},
  pages={1--6},
  issn={1000-3428},
  language={中文},
  abstract={为了解决卷积神经网络计算效率和能效较低的问题，设计了一个以8bits定点数据作为输入的卷积加速器。通过优化循环计算顺序，并与数据复用技术相结合，显著提高了卷积计算效率。此外加速器支持激活、批标准化（BN）以及池化等CNN网络中的常见计算类型，令加速器的功能更加完善。基于软硬件协同设计思想，还设计了包含RISC-V处理器和卷积加速器的SoC系统。RISC-V处理器基于开源的指令集标准，可以根据具体的设计需求扩展指令功能。将SoC系统部署在Xilinx ZCU102开发板上，RISC-V处理器和加速器分别工作在100MHz和300MHz的频率下，加速器的算力达到了153.6GOP/s，运行VGG1...}
}

@article{zhangJiYuXiaoXingZynqSoCYingJianJiaSuDeGaiJinTINYYOLOShiShiCheLiangJianCeSuanFaShiXian2019,
  title={基于小型{Zynq SoC}硬件加速的改进{TINY YOLO}实时车辆检测算法实现},
  author={张, 雲轲 and 刘, 丹},
  journal={计算机应用},
  year={2019},
  volume={39},
  number={1},
  pages={192--198},
  issn={1001-9081},
  language={中文},
  abstract={针对TINY YOLO车辆检测算法计算量过大,且在小型嵌入式系统中难以达到实时检测要求的问题。利用小型Zynq So C系统的架构优势以及TINY YOLO的网络权值中存在大量接近零的权值参数这一特点,提出硬件并行加速的改进算法,称为浓缩小型深度网络(Xerantic-TINY YOLO,X-TINY YOLO)车辆检测算法。首先对TINY YOLO中网络结构进行压缩;其次采用高效多级流水线、流水线内全并行的方式对卷积计算部分进行算法加速;最后提出与网络结构相配合的数据切割和传输方案。实验结果表明,X-TINY YOLO仅消耗50\%的片内硬件资源,可在相对于GPU和CPU性价比更高更适合嵌入式...}
}

%---------------------------------------------------------------------------%
